---
title: "geodata"
author: "DY"
date: "6/22/2022"
output: pdf_document
---

```{r setup, include=FALSE}
# Default chunk option: echo each chunk's source code in the knitted output
# (individual chunks can still override this in their own header).
knitr::opts_chunk$set(echo = TRUE)
```


## 1.1. General 

```{r, include=FALSE, fig.width=6, fig.height=5, results='asis'}
# The workspace is deliberately NOT wiped with rm(list = ls()) here: that call
# only removes objects (attached packages and options survive), so it does not
# give a clean session, and it silently destroys the user's workspace when the
# chunk is run interactively. Knit the document or restart R for a fresh state.

# Attach packages (not all are in use). pacman::p_load() installs a package
# first if it is missing. The order is kept unchanged because packages attached
# later mask same-named functions from packages attached earlier.
pacman::p_load(
  here,
  dplyr,
  tidyverse, # dplyr, readr, etc.
  data.table, # fread()
  foreign, # load data types including stata .dta files
  magrittr, # %<>% operator
  skimr, # for summarising
  lubridate, # dates
  knitr,
  kableExtra,
  janitor,
  ggplot2,
  hablar,
  gplots,
  multcomp,
  broom,
  sjPlot,
  sjlabelled,
  sjmisc
)
```

```{r, include=F}

# Declare this file's location relative to the project root so that here()
# resolves paths from that root.
here::i_am("poq/poq_margins_descriptives.Rmd")

# Restore the saved workspaces. The data frames used below (da_edit, igs_edit,
# vote_zip_crime_w, lucid_edit) are not created in this file, so they are
# presumably provided by these .RData files -- TODO confirm.
load(here("poq", "data", "da_clean.RData"))
load(here("poq", "data", "vote_zip_crime_w.RData"))
load(here("poq", "data", "igs_clean.RData"))
load(here("poq", "data", "lucid_edit.RData"))

# Attention filter for the lucid sample: keep respondents who took more than
# 180 seconds OR whose q389_1 equals 100.
# NOTE(review): the intent is "remove non attentive respondents"; confirm that
# `|` (either criterion is enough) rather than `&` is what is wanted.
lucid_edit <- lucid_edit %>%
  mutate(duration_in_seconds = as.numeric(duration_in_seconds)) %>%
  filter(duration_in_seconds > 180 | q389_1 == 100)

# "Progressives" data frames: rows with prog of at least 0.5.
da_prog <- subset(da_edit, prog >= 0.5)
igs_prog <- subset(igs_edit, prog >= 0.5)

# Split "Recallers" (out_vote == 1) from the rest (out_vote == 0).
da_recall <- subset(da_edit, out_vote == 1)
da_no <- subset(da_edit, out_vote == 0)

# Los Angeles county only.
igs_la <- subset(igs_edit, county == "los angeles")

# Progressive voters (prog1 or prog2 equal to 1) who still voted for the recall.
highlight <- subset(da_edit, (prog1 == 1 | prog2 == 1) & out_vote == 1)
highlight <- highlight %>%
  mutate(out_vote = "Conflicted Progressives")

# Recall voters who favored at least half the policies (out2 >= 0.5).
highlight_policy <- subset(vote_zip_crime_w, out2 >= 0.5 & out_vote == 1)
highlight_policy <- highlight_policy %>%
  mutate(out_vote = "Progressive Recallers")

# Comparison group: progressives who voted against the recall.
comp_group <- subset(vote_zip_crime_w, (prog1 == 1 | prog2 == 1) & out_vote == 0)

# For the table 1 comparison: progressives only, plus a 0/1 progressive flag on
# the recall voters (rows where the condition is NA fall through to 0).
only_progs <- subset(da_edit, prog1 == 1 | prog2 == 1)
da_recall <- da_recall %>%
  mutate(
    prog_binary = case_when(
      prog1 == 1 | prog2 == 1 ~ 1,
      TRUE ~ 0
    )
  )


```


# Table 3, 4 
```{r}
# Treat zip and the vote outcome as categorical in the progressive subsample.
only_progs <- only_progs %>%
  mutate(
    zip = as_factor(zip),
    out_vote = as_factor(out_vote)
  )
# table1

# P-value for the difference between groups, used as an extra table1 column.
#
# Arguments:
#   x    A list with one vector of observations per group (stratum).
#   ...  Ignored; accepted so the function can be called with extra arguments.
#
# Returns a character vector of length 2: an empty string followed by the
# formatted p-value. Numeric data are compared with t.test(); anything else
# with chisq.test() on the value-by-group contingency table.
pvalue <- function(x, ...) {
  # Flatten the groups into one data vector y and a matching group factor g.
  # seq_along()/lengths() are used instead of 1:length()/sapply() so the
  # construction is type-stable and does not count backwards on empty input.
  y <- unlist(x)
  g <- factor(rep(seq_along(x), times = lengths(x)))
  if (is.numeric(y)) {
    # For numeric variables, perform a standard 2-sample t-test
    p <- t.test(y ~ g)$p.value
  } else {
    # For categorical variables, perform a chi-squared test of independence
    p <- chisq.test(table(y, g))$p.value
  }
  # Format the p-value, using an HTML entity for the less-than sign.
  # The initial empty string places the output on the line below the variable label.
  c("", sub("<", "&lt;", format.pval(p, digits = 3, eps = 0.001)))
}


## w/out moderators, without missing values

# Progressives only, split by vote outcome, with a p-value column.
table_1_prog2prog <- table1::table1(
  ~ sal + redm + puni_t + victim + sym + rr | out_vote,
  data = only_progs,
  overall = FALSE,
  render.missing = NULL,
  render.categorical = "FREQ (PCTnoNA%)",
  export = "sample.csv",
  extra.col = list(`P-value` = pvalue)
)

# Recall voters only, split by the progressive flag, with a p-value column.
table_1_prog2recall <- table1::table1(
  ~ sal + redm + puni_t + victim + sym + rr | prog_binary,
  data = da_recall,
  overall = FALSE,
  render.missing = NULL,
  render.categorical = "FREQ (PCTnoNA%)",
  export = "sample.csv",
  extra.col = list(`P-value` = pvalue)
)

# Show both tables in the knitted document.
table_1_prog2recall
table_1_prog2prog

# export to DOCX
library(flextable)
library(table1)
table_1_prog2prog %>%
  t1flex() %>%
  save_as_docx(path = "comparison_progs2progs.docx")
table_1_prog2recall %>%
  t1flex() %>%
  save_as_docx(path = "comparison_progs2recall.docx")


```


# Table 2
```{r}

# Alexander's advice: a table with three columns.
# First column: bivariate results for the existing variables, one model each.
# Columns 1 and 2 will be particularly valuable for the claims about racial
# attitude (non-)effects.
#
# Every model below regresses out_vote on a single predictor, weighted by
# `raking` and with standard errors clustered on `response_id`.
# NOTE(review): ten models are fitted here although the advice mentions nine
# variables.

crime_rate <- estimatr::lm_robust(
  out_vote ~ log(crime_rate_2020),
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

crime_rate_change <- estimatr::lm_robust(
  out_vote ~ crime_diff_log,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

salience <- estimatr::lm_robust(
  out_vote ~ sal,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

redm <- estimatr::lm_robust(
  out_vote ~ redm,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

punitive <- estimatr::lm_robust(
  out_vote ~ puni_t,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

progressive <- estimatr::lm_robust(
  out_vote ~ prog,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

victim <- estimatr::lm_robust(
  out_vote ~ victim,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

knoweldge <- estimatr::lm_robust(
  out_vote ~ know,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

rr <- estimatr::lm_robust(
  out_vote ~ rr,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

sym <- estimatr::lm_robust(
  out_vote ~ sym,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

# Second column: the same predictors, one model each, now with the demographic
# controls only (gndr, idgy, race, pid, home, incm, age_new, edu). Weights and
# clustering are the same in every model: `raking` and `response_id`.

crime_rate_dem <- estimatr::lm_robust(
  out_vote ~ log(crime_rate_2020) +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

crime_rate_change_dem <- estimatr::lm_robust(
  out_vote ~ crime_diff_log +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

salience_dem <- estimatr::lm_robust(
  out_vote ~ sal +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

redm_dem <- estimatr::lm_robust(
  out_vote ~ redm +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

punitive_dem <- estimatr::lm_robust(
  out_vote ~ puni_t +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

progressive_dem <- estimatr::lm_robust(
  out_vote ~ prog +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

victim_dem <- estimatr::lm_robust(
  out_vote ~ victim +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

knoweldge_dem <- estimatr::lm_robust(
  out_vote ~ know +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

rr_dem <- estimatr::lm_robust(
  out_vote ~ rr +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

sym_dem <- estimatr::lm_robust(
  out_vote ~ sym +
    gndr + idgy + race + pid + home + incm + age_new + edu,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

# Split the demographic-control models into two groups, one per exported table.
models1 <- list(
  crime_rate_dem, crime_rate_change_dem, salience_dem,
  redm_dem, punitive_dem, progressive_dem
)
models2 <- list(victim_dem, knoweldge_dem, rr_dem, sym_dem)

# Save each group as a LaTeX longtable. The custom names label the model
# columns in the same order as the lists above. use.packages = FALSE leaves
# the \usepackage lines out of the written file.
texreg::texreg(
  models1,
  custom.model.names = c(
    "Crime Rate (log)",
    "Increase in Crime Rate",
    "Salience of Crime",
    "Redeemability belief",
    "Punitive Sentiment",
    "Progressive Sentiment" # was misspelled "Progressive Sentimen" in the output
  ),
  longtable = TRUE,
  use.packages = FALSE,
  file = "regression_results1.tex"
)

texreg::texreg(
  models2,
  custom.model.names = c(
    "Crime Victim?",
    "Crime Politics Knowledge",
    "Racial Resentment",
    "Racial Sympathy"
  ),
  longtable = TRUE,
  use.packages = FALSE,
  file = "regression_results2.tex"
)


# Third column: all predictors and controls together in one model. Column 2
# might give the cleanest picture, though ideally things are consistent across
# the columns.

# Full specification, using the raking weights and the log of the crime rate;
# standard errors are clustered on response_id.
olsvote_all <- estimatr::lm_robust(
  out_vote ~
    log(crime_rate_2020) + sal + crime_diff_log +
    gndr + idgy + race + pid + home + incm + age_new + edu +
    redm + puni_t + prog +
    victim + know +
    sym + rr,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

# Write the full model to its own LaTeX longtable.
texreg::texreg(
  olsvote_all,
  longtable = TRUE,
  use.packages = FALSE,
  file = "regression_results3.tex"
)


# Shorter specification: no demographic controls, crime_diff_log or know.
# Same weights and clustering as the full model.
olsvote_s <- estimatr::lm_robust(
  out_vote ~
    log(crime_rate_2020) + sal + victim +
    redm + puni_t + prog +
    sym + rr,
  data = vote_zip_crime_w,
  weights = raking,
  clusters = response_id
)

```

# Figure E.1
```{r}

# plot_model(olsvote_demo, sort.est = T) # print only demographics without weird categories!!

# Coefficient plot of the full model, restricted to the listed terms and sorted
# by estimate.
# NOTE(review): axis.labels is a plain positional vector while sort.est = TRUE
# orders the terms by their estimates, so the label order depends on the fitted
# values -- verify the labels against the rendered plot after any model change.
plot_model(
  olsvote_all,
  sort.est = TRUE,
  terms = c(
    "sal", "puni_t", "race [Asian, Hispanic, Black]", "gndr [Man]",
    "pid [Democrat]", "home [I own a home]", "rr", "victim",
    "log(crime_rate_2020)", "crime_diff_log", "age_new", "redm", "sym",
    "prog", "know"
  ),
  title = "Predictors of recall support",
  axis.labels = c(
    "Progressive Sentiment",
    "Black identified",
    "Home Owner",
    "Crime Politics knowledge",
    "Racial Sympathy",
    "Crime victim?",
    "Hispanic Identified",
    "Age",
    "Redeemability Belief",
    "Man Identified",
    "Relative Crime Rate",
    "Democrat Identified",
    "Racial Resentment",
    "Zip code crime Rate Increase",
    "Asian Identified",
    "Punitive Sentiment",
    "Salience of Crime"
  )
)
# Save the plot that was just displayed.
ggsave("preds_long.png", width = 6.5, height = 8, units = "in")


```



# sample

# Exit poll
```{r}



# Treat zip as categorical in the exit-poll data.
da_edit <- da_edit %>%
  mutate(zip = as_factor(zip))
# table1

# P-value for the difference between groups, usable as an extra table1 column.
#
# Arguments:
#   x    A list with one vector of observations per group (stratum).
#   ...  Ignored; accepted so the function can be called with extra arguments.
#
# Returns a character vector of length 2: an empty string followed by the
# formatted p-value. Numeric data are compared with t.test(); anything else
# with chisq.test() on the value-by-group contingency table.
pvalue <- function(x, ...) {
  # Flatten the groups into one data vector y and a matching group factor g.
  # seq_along()/lengths() are used instead of 1:length()/sapply() so the
  # construction is type-stable and does not count backwards on empty input.
  y <- unlist(x)
  g <- factor(rep(seq_along(x), times = lengths(x)))
  if (is.numeric(y)) {
    # For numeric variables, perform a standard 2-sample t-test
    p <- t.test(y ~ g)$p.value
  } else {
    # For categorical variables, perform a chi-squared test of independence
    p <- chisq.test(table(y, g))$p.value
  }
  # Format the p-value, using an HTML entity for the less-than sign.
  # The initial empty string places the output on the line below the variable label.
  c("", sub("<", "&lt;", format.pval(p, digits = 3, eps = 0.001)))
}


## w/out moderators, without missing values

# Demographics of the exit-poll sample by vote outcome, including an overall
# column. The p-value column is currently switched off.
table_1 <- table1::table1(
  ~ age_new + gndr + idgy + pid + race + home + edu + incm | out_vote,
  data = da_edit,
  overall = TRUE,
  render.missing = NULL,
  render.categorical = "FREQ (PCTnoNA%)",
  export = "sample.csv"
) # , extra.col = list(`P-value` = pvalue))
table_1

# export to DOCX
table_1 %>%
  t1flex() %>%
  save_as_docx(path = "table1.docx")

library(papaja)

# Restructure object: name each block of the table after its first row name,
# then drop that first row from the block.
x <- attr(table_1, "obj")$contents
names(x) <- lapply(x, function(block) {
  rownames(block)[[1L]]
})
x <- lapply(x, function(block) {
  block[-1L, ]
})

# Use apa_table() for output to pdf in knit
apa_table(x, caption = "Output from table1 in a pdf document.")
```

# Lucid
```{r}
# Build a display-ready copy of the lucid demographics: pick the columns, make
# hhi numeric, give the columns readable names, then turn numeric codes into
# labels. Codes that match no rule (including NA) fall through to the TRUE
# branch of each case_when(); Gender has no such branch, so other codes are NA.
# NOTE(review): any Income code <= 2, including a negative "no answer" code if
# the data uses one, is labelled "$0-$19,999" -- confirm against the codebook.
lucid_sample <- lucid_edit %>%
  dplyr::select(
    age, gender, idgy, political_party, ethnicity, home, education, hhi
  ) %>%
  mutate(hhi = as.numeric(hhi)) %>%
  rename(
    Age = age,
    Gender = gender,
    Political_Ideology = idgy,
    Partisanship = political_party,
    Ethnicity = ethnicity,
    Home_owner = home,
    Education = education,
    Income = hhi
  ) %>%
  mutate(
    Gender = case_when(
      Gender == 1 ~ "Male",
      Gender == 2 ~ "Female"
    ),
    Partisanship = case_when(
      Partisanship %in% c(1, 2, 3, 6) ~ "Democrat",
      Partisanship %in% c(5, 8, 9, 10) ~ "Republican",
      TRUE ~ "Independent/Other" # maybe code leaning as independent
    ),
    Ethnicity = case_when(
      Ethnicity == 1 ~ "White",
      Ethnicity == 2 ~ "Black",
      Ethnicity == 3 ~ "Native American",
      Ethnicity %in% c(4, 5, 6, 7, 8, 9, 10) ~ "Asian",
      Ethnicity %in% c(11, 12, 13, 14) ~ "Pacific Islander",
      TRUE ~ "Other"
    ),
    Education = case_when(
      Education == 1 ~ "Some high school or less",
      Education == 2 ~ "High school graduate",
      Education == 3 ~ "Post high school vocational training",
      Education == 4 ~ "Some college, but no degree",
      Education == 5 ~ "Associate's degree",
      Education == 6 ~ "Bachelor's degree",
      Education == 7 ~ "Master's or professional degree",
      Education == 8 ~ "Doctorate degree",
      TRUE ~ "Other"
    ),
    Income = case_when(
      Income <= 2 ~ "$0-$19,999",
      Income > 2 & Income < 7 ~ "$20,000-$39,999",
      Income > 6 & Income < 13 ~ "$40,000-$69,999",
      Income > 12 & Income < 19 ~ "$70,000-$99,999",
      Income > 18 & Income < 21 ~ "$100,000-$149,999",
      Income > 20 & Income < 25 ~ "$150,000+",
      TRUE ~ "Other"
    )
  )
# Table of the raw (coded) lucid variables, shown for reference.
table_2_tmp <- table1::table1(
  ~ age + gender + idgy + political_party + ethnicity + home + education + hhi,
  data = lucid_edit,
  render.missing = NULL,
  render.categorical = "FREQ (PCTnoNA%)",
  export = "sample_lucid.csv"
) # , extra.col = list(`P-value` = pvalue)), overall = T,
table_2_tmp

# Table of the relabelled lucid variables.
table_2 <- table1::table1(
  ~ Age + Gender + Political_Ideology + Partisanship + Ethnicity + Home_owner +
    Education + Income,
  data = lucid_sample,
  render.missing = NULL,
  render.categorical = "FREQ (PCTnoNA%)",
  export = "sample_lucid2.csv"
) # , extra.col = list(`P-value` = pvalue)), overall = T,
table_2


# export to DOCX
table_2 %>%
  t1flex() %>%
  save_as_docx(path = "table2.docx")


```



# Figure C.1

```{r, results='asis'}
# counties by progressive scale
# The means plot uses seven counties; the bar chart adds lassen and modoc.
focus_counties <- c(
  "san francisco", "los angeles", "alameda", "san diego",
  "orange", "riverside", "san bernardino"
)
plotmeans(prog ~ county, data = subset(igs_edit, county %in% focus_counties))
counties <- subset(igs_edit, county %in% c(focus_counties, "lassen", "modoc"))

# Stacked proportion bars of the prog values per county (rows with NaN prog are
# dropped), with counties ordered via reorder(county, -prog).
# Disabled percentage labels, kept for reference:
#   geom_text(aes(x = county,
#                 label = scales::percent(after_stat(count / tapply(count, x, sum)[x])),
#                 group = prog), position = "fill", stat = "count")
ggplot(
  subset(counties, !is.nan(prog)),
  aes(x = reorder(county, -prog), fill = as.factor(prog))
) +
  geom_bar(position = "fill") +
  theme_538() +
  xlab("CA Counties") +
  ylab("Progressive Scale 
       (Proportion of Respondents)") +
  coord_flip() +
  scale_fill_viridis_d(
    option = "inferno",
    name = "Penal Progressive",
    labels = c(
      "Not Progressive", "Mostly Not Progressive", "Neither",
      "Mostly Progressive", "Progressive"
    )
  )

ggsave("counties.png", width = 11, height = 8, units = "in")

```



# Create data frames for text analysis

```{r}
# One row per open-ended answer: `why` is the first non-missing value among
# why_1, why_2 and why_3; rows with a missing why, group1 or out1 are dropped.
text <- lucid_edit %>%
  dplyr::select(why_1, why_2, why_3, group1, out1) %>%
  mutate(why = dplyr::coalesce(why_1, why_2, why_3)) %>%
  dplyr::select(why, group1, out1) %>%
  na.omit()
```

# Using the text package for ML text summarization
```{r}

# install.packages("text")
# install.packages("reticulate")
# reticulate::install_miniconda(force=TRUE, update=FALSE)
# text::textrpp_install()
#text::textrpp_initialize()
library(reticulate)
library(text)
library(sentencepiece)


#reticulate::conda_list()
#use_condaenv("textrpp_condaenv")
# Initialize the installed conda environment
#textrpp_initialize(save_profile = TRUE) #save_profile = TRUE saves the settings so that you do not have to run textrpp_initialize() again after restarting R
##

#models:
#https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct.

# https://huggingface.co/sshleifer/distilbart-cnn-12-6?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct.


# All "do less" answers with out1 == 0, joined into one comma-separated string.
opp_less <- toString(subset(text, group1 == "do less" & out1 == 0)$why)

# Result column -> summarization model, run in this order.
less_models <- c(
  bart = "philschmid/bart-large-cnn-samsum",
  t5 = "philschmid/flan-t5-base-samsum",
  FB = "facebook/bart-large-cnn",
  booksum = "pszemraj/led-large-book-summary"
)

# One-row frame (row "sum") with one summary column per model.
sum_oppose_less <- data.frame(
  matrix(nrow = 1, ncol = 4, dimnames = list(c("sum"), names(less_models)))
)
for (model_col in names(less_models)) {
  sum_oppose_less[[model_col]] <- textSum(
    opp_less,
    max_length = 300L,
    min_length = 50L,
    model = less_models[[model_col]]
  )$sum_x
}


# All "do diff" answers with out1 == 0, joined into one comma-separated string.
opp_diff <- toString(subset(text, group1 == "do diff" & out1 == 0)$why)

# Result column -> summarization model, run in this order.
diff_models <- c(
  bart = "philschmid/bart-large-cnn-samsum",
  t5 = "philschmid/flan-t5-base-samsum",
  FB = "facebook/bart-large-cnn",
  booksum = "pszemraj/led-large-book-summary"
)

# One-row frame (row "sum") with one summary column per model.
sum_oppose_diff <- data.frame(
  matrix(nrow = 1, ncol = 4, dimnames = list(c("sum"), names(diff_models)))
)
for (model_col in names(diff_models)) {
  sum_oppose_diff[[model_col]] <- textSum(
    opp_diff,
    max_length = 300L,
    min_length = 50L,
    model = diff_models[[model_col]]
  )$sum_x
}

## do more
# All "do more" answers with out1 == 0, joined into one comma-separated string.
opp_more <- toString(subset(text, group1 == "do more" & out1 == 0)$why)

# The frame has three columns, but only USE and booksum are filled: the
# facebook/bart-large-cnn run for the FB column is disabled, so FB stays NA.
sum_oppose_more <- data.frame(
  matrix(nrow = 1, ncol = 3, dimnames = list(c("sum"), c("USE", "FB", "booksum")))
)
more_models <- c(
  USE = "philschmid/flan-t5-base-samsum",
  booksum = "pszemraj/led-large-book-summary"
)
for (model_col in names(more_models)) {
  sum_oppose_more[[model_col]] <- textSum(
    opp_more,
    max_length = 300L,
    min_length = 50L,
    model = more_models[[model_col]]
  )$sum_x
}
```


# Figure 8, 9
```{r}
# library() instead of require(): require() only returns FALSE with a warning
# when a package is missing, so the failure would surface later as a confusing
# "could not find function" error. library() stops immediately.
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(tm)



# compare in each treatment the people in favor and the people against

# "do less" group: build a corpus from the `why` answers, tokenize without
# punctuation, drop the listed filler words, and count tokens per document.
do_less <- subset(text, group1 == "do less")
do_less <- corpus(do_less, text_field = "why")
do_less <- tokens(do_less, remove_punct = TRUE)
do_less <- tokens_remove(do_less, c("i", "the", "as", "if", "do", "get", "from", "put", "them", "as", "non", "not", "of", "and", "in", "of", "that", "can", "so", "to", "when", "you", "idk", "don't", "be", "no"))
do_less <- dfm(do_less)

# Keyness of each word, with documents where out1 >= 1 as the target group and
# the remaining documents as the reference group.
tstat_key_do_less <- textstat_keyness(do_less,
  target = do_less$out1 >= 1
)
keyness_less <- textplot_keyness(tstat_key_do_less)
# Recolor and relabel the two groups, display the plot, then save what was
# displayed.
keyness_less +
  scale_color_manual(
    values = c("lightblue", "grey"),
    labels = c("Support reducing the extent", "oppose reducing the extent")
  ) +
  labs(color = "")
ggsave("keyness_less.png", width = 10, height = 7.5, units = "in")


# "do diff" group: corpus from the `why` answers -> tokens without punctuation
# -> filler words removed -> document-feature matrix.
do_diff <- subset(text, group1 == "do diff") %>%
  corpus(text_field = "why") %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(c("i", "the", "as", "if", "do", "get", "from", "put", "them", "as", "non", "not", "of", "and", "in", "of", "that", "can", "so", "to", "when", "you", "idk", "don't", "be", "no", "with", "a", "but", "too", "than", "who", "have", "has", "are", "see", "they", "someone", "him", "on")) %>%
  dfm()

# Keyness with documents where out1 >= 1 as the target group.
tstat_key_do_diff <- textstat_keyness(do_diff, target = do_diff$out1 >= 1)
keyness_diff <- textplot_keyness(tstat_key_do_diff)

# Recolor and relabel, display, then save the displayed plot.
keyness_diff +
  scale_color_manual(
    values = c("lightblue", "grey"),
    labels = c("Support reduce intensity", "oppose reduce intensity")
  ) +
  labs(color = "")
ggsave("keyness_diff.png", width = 10, height = 7.5, units = "in")



# "do more" group: corpus from the `why` answers -> tokens without punctuation
# -> filler words removed -> document-feature matrix.
# NOTE(review): this filler-word list is shorter than the ones used for the
# other two groups -- confirm that is intended.
do_more <- subset(text, group1 == "do more") %>%
  corpus(text_field = "why") %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(c("i", "the", "as", "if", "do", "get", "from", "put", "them", "as", "non", "not", "of")) %>%
  dfm()

# Keyness with documents where out1 >= 1 as the target group.
tstat_key_do_more <- textstat_keyness(do_more, target = do_more$out1 >= 1)
keyness_more <- textplot_keyness(tstat_key_do_more)

# Recolor and relabel, display, then save the displayed plot.
keyness_more +
  scale_color_manual(
    values = c("lightblue", "grey"),
    labels = c("Support get-tough", "oppose get-tough")
  ) +
  labs(color = "")
ggsave("keyness_more.png", width = 10, height = 7.5, units = "in")


```